{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extended Word2vec and GloVe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\thushan\\documents\\python_virtualenvs\\tensorflow_venv\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "# These are all the modules we'll be using later. Make sure you can import them\n",
    "# before proceeding further.\n",
    "%matplotlib inline\n",
    "from __future__ import print_function\n",
    "import collections\n",
    "import math\n",
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "import tensorflow as tf\n",
    "import bz2\n",
    "from matplotlib import pylab\n",
    "from six.moves import range\n",
    "from six.moves.urllib.request import urlretrieve\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.cluster import KMeans\n",
    "import nltk # standard preprocessing\n",
    "import operator # sorting items in dictionary by value\n",
    "#nltk.download() #tokenizers/punkt/PY3/english.pickle\n",
    "from math import ceil\n",
    "import csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset\n",
    "This code downloads a [dataset](http://www.evanjones.ca/software/wikipedia2text.html) consisting of several Wikipedia articles totaling up to roughly 61 megabytes. Additionally the code makes sure the file has the correct size after downloading it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found and verified wikipedia2text-extracted.txt.bz2\n"
     ]
    }
   ],
   "source": [
    "url = 'http://www.evanjones.ca/software/'\n",
    "\n",
    "def maybe_download(filename, expected_bytes):\n",
    "  \"\"\"\n",
    "  Fetch `filename` from `url` unless it already exists locally, then\n",
    "  check that its size on disk equals `expected_bytes`.\n",
    "\n",
    "  Returns the local file name; raises Exception on a size mismatch.\n",
    "  \"\"\"\n",
    "  if not os.path.exists(filename):\n",
    "    filename, _ = urlretrieve(url + filename, filename)\n",
    "\n",
    "  actual_bytes = os.stat(filename).st_size\n",
    "  if actual_bytes != expected_bytes:\n",
    "    # show the size we actually got to help diagnose a bad download\n",
    "    print(actual_bytes)\n",
    "    raise Exception(\n",
    "      'Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
    "\n",
    "  print('Found and verified %s' % filename)\n",
    "  return filename\n",
    "\n",
    "filename = maybe_download('wikipedia2text-extracted.txt.bz2', 18377035)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read Data with Preprocessing with NLTK\n",
    "Reads the data as-is into a string, converts it to lower-case and tokenizes it using the nltk library. This code reads the data in 1MB portions, as processing the full text at once slows down the task, and returns a list of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading data...\n",
      "Data size 3360286\n",
      "Example words (start):  ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']\n",
      "Example words (end):  ['favorable', 'long-term', 'outcomes', 'for', 'around', 'half', 'of', 'those', 'diagnosed', 'with']\n"
     ]
    }
   ],
   "source": [
    "def read_data(filename):\n",
    "  \"\"\"\n",
    "  Read a bz2-compressed UTF-8 text file and return its contents as a\n",
    "  list of lower-cased tokens produced by nltk.word_tokenize.\n",
    "\n",
    "  The decompressed stream is consumed in 1 MB portions until end of file,\n",
    "  because tokenizing the full text in a single call is slow. Portions are\n",
    "  only cut at whitespace so that no word is split in two, and an\n",
    "  incremental decoder keeps multi-byte characters that straddle a read\n",
    "  boundary intact.\n",
    "  \"\"\"\n",
    "  import codecs  # local import: only needed for incremental UTF-8 decoding\n",
    "\n",
    "  chunk_size = 1024 * 1024 # reading 1 MB at a time as the dataset is moderately large\n",
    "  decoder = codecs.getincrementaldecoder('utf-8')()\n",
    "  data = []\n",
    "  pending = ''  # text after the last whitespace, carried into the next portion\n",
    "\n",
    "  print('Reading data...')\n",
    "  with bz2.BZ2File(filename) as f:\n",
    "    while True:\n",
    "      # The size of the .bz2 file on disk says nothing about the length of\n",
    "      # the decompressed stream, so keep reading until read() returns b''.\n",
    "      raw = f.read(chunk_size)\n",
    "      at_eof = not raw\n",
    "      text = pending + decoder.decode(raw, final=at_eof)\n",
    "      if at_eof:\n",
    "        ready, pending = text, ''\n",
    "      else:\n",
    "        # cut just after the last whitespace character (cut == 0 if none)\n",
    "        cut = max(text.rfind(ws) for ws in ' \\n\\r\\t') + 1\n",
    "        ready, pending = text[:cut], text[cut:]\n",
    "      if ready:\n",
    "        # tokenizes a string to words residing in a list\n",
    "        data.extend(nltk.word_tokenize(ready.lower()))\n",
    "      if at_eof:\n",
    "        break\n",
    "  return data\n",
    "\n",
    "words = read_data(filename)\n",
    "print('Data size %d' % len(words))\n",
    "token_count = len(words)\n",
    "\n",
    "print('Example words (start): ',words[:10])\n",
    "print('Example words (end): ',words[-10:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building the Dictionaries\n",
    "Builds the following. To understand each of these elements, let us also assume the text \"I like to go to school\"\n",
    "\n",
    "* `dictionary`: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})\n",
    "* `reverse_dictionary`: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school})\n",
    "* `count`: List of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)])\n",
    "* `data` : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])\n",
    "\n",
    "It also introduces an additional special token `UNK` to denote words that are too rare to make use of."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most common words (+UNK) [['UNK', 69215], ('the', 226881), (',', 184013), ('.', 120944), ('of', 116323)]\n",
      "Sample data [1728, 9, 8, 17488, 223, 4, 5211, 4461, 26, 11637]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "vocabulary_size = 50000\n",
    "\n",
    "def build_dataset(words):\n",
    "  \"\"\"\n",
    "  Turn a list of string tokens into word IDs.\n",
    "\n",
    "  The vocabulary consists of 'UNK' (ID 0) followed by the\n",
    "  vocabulary_size - 1 most common words; any other word is mapped to ID 0.\n",
    "\n",
    "  Returns (data, count, dictionary, reverse_dictionary) where\n",
    "    data               - the input tokens replaced by their IDs\n",
    "    count              - ['UNK', n_unknown] followed by (word, frequency) pairs\n",
    "    dictionary         - word -> ID\n",
    "    reverse_dictionary - ID -> word\n",
    "  \"\"\"\n",
    "  count = [['UNK', -1]]\n",
    "  count += collections.Counter(words).most_common(vocabulary_size - 1)\n",
    "\n",
    "  # IDs are handed out in the order the entries appear in `count`\n",
    "  dictionary = {}\n",
    "  for entry in count:\n",
    "    dictionary[entry[0]] = len(dictionary)\n",
    "\n",
    "  # out-of-vocabulary words fall back to 0, i.e. dictionary['UNK']\n",
    "  data = [dictionary.get(token, 0) for token in words]\n",
    "  count[0][1] = sum(1 for token in words if token not in dictionary)\n",
    "\n",
    "  reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}\n",
    "  assert len(dictionary) == vocabulary_size\n",
    "  return data, count, dictionary, reverse_dictionary\n",
    "\n",
    "data, count, dictionary, reverse_dictionary = build_dataset(words)\n",
    "print('Most common words (+UNK)', count[:5])\n",
    "print('Sample data', data[:10])\n",
    "del words  # Hint to reduce memory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data: ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed']\n",
      "\n",
      "with window_size = 1:\n",
      "    batch: ['is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at']\n",
      "    labels: [['propaganda', 'a'], ['is', 'concerted'], ['a', 'set'], ['concerted', 'of'], ['set', 'messages'], ['of', 'aimed'], ['messages', 'at'], ['aimed', 'influencing']]\n",
      "\n",
      "with window_size = 2:\n",
      "    batch: ['a', 'concerted', 'set', 'of', 'messages', 'aimed', 'at', 'influencing']\n",
      "    labels: [['propaganda', 'is', 'concerted', 'set'], ['is', 'a', 'set', 'of'], ['a', 'concerted', 'of', 'messages'], ['concerted', 'set', 'messages', 'aimed'], ['set', 'of', 'aimed', 'at'], ['of', 'messages', 'at', 'influencing'], ['messages', 'aimed', 'influencing', 'the'], ['aimed', 'at', 'the', 'opinions']]\n"
     ]
    }
   ],
   "source": [
    "data_index = 0\n",
    "\n",
    "def generate_batch(batch_size, window_size):\n",
    "  \"\"\"\n",
    "  Produce one batch of (target, context) word IDs from the global `data`.\n",
    "\n",
    "  Returns\n",
    "    batch  - int32 array of shape (batch_size,) holding the target word IDs\n",
    "    labels - int32 array of shape (batch_size, 2*window_size) holding, for\n",
    "             each target, the window_size words before it followed by the\n",
    "             window_size words after it\n",
    "\n",
    "  The global cursor `data_index` is advanced by span + batch_size\n",
    "  positions per call and wraps around at the end of `data`.\n",
    "  \"\"\"\n",
    "  global data_index\n",
    "\n",
    "  # the sliding window looks like [ window_size words, target, window_size words ]\n",
    "  span = 2 * window_size + 1\n",
    "  # every position of the window except the centre one is a context word\n",
    "  context_positions = [pos for pos in range(span) if pos != window_size]\n",
    "\n",
    "  batch = np.empty(batch_size, dtype=np.int32)\n",
    "  labels = np.empty((batch_size, len(context_positions)), dtype=np.int32)\n",
    "\n",
    "  window = collections.deque(maxlen=span)\n",
    "\n",
    "  def slide():\n",
    "    # push the next word ID into the window and move the cursor (with wrap-around)\n",
    "    global data_index\n",
    "    window.append(data[data_index])\n",
    "    data_index = (data_index + 1) % len(data)\n",
    "\n",
    "  # initial fill of the window\n",
    "  for _ in range(span):\n",
    "    slide()\n",
    "\n",
    "  # row i: centre word goes to batch, surrounding words go to labels,\n",
    "  # then the window moves one word to the right\n",
    "  for row in range(batch_size):\n",
    "    batch[row] = window[window_size]\n",
    "    labels[row, :] = [window[pos] for pos in context_positions]\n",
    "    slide()\n",
    "\n",
    "  return batch, labels\n",
    "\n",
    "print('data:', [reverse_dictionary[di] for di in data[:8]])\n",
    "\n",
    "for window_size in (1, 2):\n",
    "    # restart from the beginning of the text for every demo run\n",
    "    data_index = 0\n",
    "    batch, labels = generate_batch(batch_size=8, window_size=window_size)\n",
    "    print('\\nwith window_size = %d:' % window_size)\n",
    "    print('    batch:', [reverse_dictionary[bi] for bi in batch])\n",
    "    print('    labels:', [[reverse_dictionary[li] for li in lbls] for lbls in labels])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Structured Skip-Gram Algorithm\n",
    "The basic idea behind the structured skip-gram algorithm is to pay attention to the position of the context words during learning. Giving the algorithm the power to distinguish between context words falling very close to the target word and the ones that fall far away from it allows the structured skip-gram model to learn better word vectors ([Paper](http://www.cs.cmu.edu/~lingwang/papers/naacl2015.pdf)). You can learn about this algorithm in more detail in the Chapter 4 text."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Defining Hyperparameters\n",
    "\n",
    "Here we define several hyperparameters including `batch_size` (amount of samples in a single batch) `embedding_size` (size of embedding vectors) `window_size` (context window size)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "batch_size = 128 # Data points in a single batch\n",
    "embedding_size = 128 # Dimension of the embedding vector.\n",
    "window_size = 2 # How many words to consider left and right.\n",
    "\n",
    "# We pick a random validation set to sample nearest neighbors\n",
    "valid_size = 16 # Random set of words to evaluate similarity on.\n",
    "# We sample valid datapoints randomly from a large window without always being deterministic\n",
    "valid_window = 50\n",
    "\n",
    "# When selecting valid examples, we select some of the most frequent words as well as\n",
    "# some moderately rare words as well\n",
    "valid_examples = np.array(random.sample(range(valid_window), valid_size))\n",
    "valid_examples = np.append(valid_examples,random.sample(range(1000, 1000+valid_window), valid_size),axis=0)\n",
    "\n",
    "num_sampled = 32 # Number of negative examples to sample."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Defining Inputs and Outputs\n",
    "\n",
    "Here we define placeholders for feeding in training inputs and outputs (each of size `batch_size`) and a constant tensor to contain validation examples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Clear the default graph before the model's ops are (re)defined below.\n",
    "tf.reset_default_graph()\n",
    "\n",
    "# Training input data (target word IDs), one ID per batch element.\n",
    "train_dataset = tf.placeholder(tf.int32, shape=[batch_size])\n",
    "# Training input label data (context word IDs): a list holding one\n",
    "# [batch_size, 1] placeholder for each of the 2*window_size context positions.\n",
    "train_labels = [tf.placeholder(tf.int32, shape=[batch_size, 1]) for _ in range(2*window_size)]\n",
    "# Validation input data, we don't need a placeholder\n",
    "# as we have already defined the IDs of the words selected\n",
    "# as validation data\n",
    "valid_dataset = tf.constant(valid_examples, dtype=tf.int32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Defining Model Parameters and Other Variables\n",
    "We now define several TensorFlow variables such as an embedding layer (`embeddings`) and neural network parameters (`softmax_weights` and `softmax_biases`). Note that there is a separate set of softmax weights and biases for each of the `2*window_size` context positions, so the softmax parameters are `2*window_size` times larger than those of the original skip-gram algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Embedding matrix of shape [vocabulary_size, embedding_size],\n",
    "# initialised uniformly between -1.0 and 1.0.\n",
    "embeddings = tf.Variable(\n",
    "tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "# One softmax weight matrix per context position (2*window_size in total),\n",
    "# each of shape [vocabulary_size, embedding_size].\n",
    "softmax_weights = [tf.Variable(\n",
    "tf.truncated_normal([vocabulary_size, embedding_size],\n",
    "                     stddev=0.5 / math.sqrt(embedding_size))) for _ in range(2*window_size)]\n",
    "# One bias vector of length vocabulary_size per context position,\n",
    "# initialised uniformly between 0.0 and 0.01.\n",
    "softmax_biases = [tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01)) for _ in range(2*window_size)]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Defining the Model Computations\n",
    "\n",
    "We first define a lookup operation to fetch the corresponding embedding vectors for a set of given inputs. With that, we define the loss using `tf.nn.sampled_softmax_loss`, a loss function based on sampling negative classes, which takes in the embedding vectors and previously defined neural network parameters. One such loss is computed for each context position and the results are summed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\users\\thushan\\documents\\python_virtualenvs\\tensorflow_venv\\lib\\site-packages\\tensorflow\\python\\ops\\nn_impl.py:1344: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "\n",
      "Future major versions of TensorFlow will allow gradients to flow\n",
      "into the labels input on backprop by default.\n",
      "\n",
      "See @{tf.nn.softmax_cross_entropy_with_logits_v2}.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Model.\n",
    "# Look up the embedding vectors of the target words in the batch.\n",
    "embed = tf.nn.embedding_lookup(embeddings, train_dataset)\n",
    "\n",
    "# You might see the following warning when running the lines below:\n",
    "# WARNING:tensorflow:From c:\\...\\lib\\site-packages\\tensorflow\\python\\ops\\nn_impl.py:1346: \n",
    "# softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and \n",
    "# will be removed in a future version.\n",
    "# This is due to the sampled_softmax_loss function using a deprecated function internally.\n",
    "# It is a warning, not an error in this code, and can be ignored.\n",
    "\n",
    "# Compute the softmax loss, using a sample of the negative labels each time.\n",
    "# There is one loss term per context position wi, each using that position's own\n",
    "# softmax weights, biases and labels; every term is averaged over the batch and\n",
    "# the 2*window_size averages are then summed into a single scalar.\n",
    "loss = tf.reduce_sum(\n",
    "[\n",
    "    tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weights[wi], biases=softmax_biases[wi], inputs=embed,\n",
    "                           labels=train_labels[wi], num_sampled=num_sampled, num_classes=vocabulary_size))\n",
    "    for wi in range(window_size*2)\n",
    "]\n",
    ")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculating Word Similarities \n",
    "We calculate the similarity between two given words in terms of the cosine similarity of their embedding vectors. To do this efficiently, we use matrix operations, as shown below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Compute the similarity between the validation words and all embeddings.\n",
    "# We use the cosine similarity: every embedding row is divided by its\n",
    "# L2 norm, so a plain dot product of two rows is the cosine of their angle.\n",
    "norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))\n",
    "normalized_embeddings = embeddings / norm\n",
    "# Normalized embedding rows of the validation word IDs.\n",
    "valid_embeddings = tf.nn.embedding_lookup(\n",
    "normalized_embeddings, valid_dataset)\n",
    "# Row i holds the similarity of validation word i to every vocabulary word.\n",
    "similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Parameter Optimizer\n",
    "\n",
    "We then define a constant learning rate and an optimizer which uses the Adagrad method. Feel free to experiment with other optimizers listed [here](https://www.tensorflow.org/api_guides/python/train)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "# Optimizer.\n",
    "# Adagrad with a constant learning rate of 1.0; running this op performs\n",
    "# one update step that minimizes `loss`.\n",
    "optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running the Structured Skip-gram Algorithm\n",
    "\n",
    "Here we run the structured skip-gram algorithm we defined above. Specifically, we first initialize variables, and then train the algorithm for many steps (`num_steps`). And every few steps we evaluate the algorithm on a fixed validation set and print out the words that appear to be closest for a given set of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialized\n",
      "Average loss at step 2000: 14.825290\n",
      "Average loss at step 4000: 12.924444\n",
      "Average loss at step 6000: 12.492212\n",
      "Average loss at step 8000: 12.194010\n",
      "Average loss at step 10000: 11.985265\n",
      "Nearest to .: ;, ,, of, :, and, reclassify, '', in,\n",
      "Nearest to which: but, that, who, and, it, what, where, then,\n",
      "Nearest to an: a, the, chong, its, constrained, rockwell, spartan, cigars,\n",
      "Nearest to as: creating, by, 29.9, kunda, ravens, diracodon, including, attractive,\n",
      "Nearest to be: been, have, being, daly, was, spectroscopic, often, were,\n",
      "Nearest to first: last, second, next, only, same, main, original, late,\n",
      "Nearest to ,: ;, ., (, and, :, of, —, ''protecteur,\n",
      "Nearest to (: ;, ,, na+, 30.1, travis, :, per, dram,\n",
      "Nearest to from: in, into, across, by, at, resident, lcd, between,\n",
      "Nearest to for: soo, of, over, follower, bien, among, inequalities, introductions,\n",
      "Nearest to ;: ., ,, :, (, —, ..., magill, one-man,\n",
      "Nearest to have: had, has, were, are, be, 7-6, year.the, make,\n",
      "Nearest to UNK: artificially, postulated, disasters, cooling, tselinograd, indefinite, enthusiastic, mass-marketed,\n",
      "Nearest to or: and, centred, hematoma, allowing, preeminent, than, resident, tubulin,\n",
      "Nearest to :: ;, ., dexter, (, stallion, methodologies, ,, une,\n",
      "Nearest to the: a, its, their, his, any, this, an, another,\n",
      "Average loss at step 12000: 11.888148\n",
      "Average loss at step 14000: 11.728705\n",
      "Average loss at step 16000: 11.675454\n",
      "Average loss at step 18000: 11.627067\n",
      "Average loss at step 20000: 11.579372\n",
      "Nearest to .: ;, ,, :, —, and, of, (, pear-shaped,\n",
      "Nearest to which: that, but, who, where, and, can, this, ecstasy,\n",
      "Nearest to an: a, the, rockwell, irwin, spartan, reclassify, oyo, corrugated,\n",
      "Nearest to as: kunda, called, yorkville, ordinances, attractive, creating, diracodon, revitalized,\n",
      "Nearest to be: been, being, have, become, were, write, amnh, clearly,\n",
      "Nearest to first: last, second, next, only, final, original, drury, main,\n",
      "Nearest to ,: ;, ., —, -, :, (, and, turkish-cypriot,\n",
      "Nearest to (: ;, -, ,, —, or, per, travis, :,\n",
      "Nearest to from: across, into, in, by, between, towards, through, on,\n",
      "Nearest to for: during, bien, yamaha, with, within, after, soo, among,\n",
      "Nearest to ;: ., ,, :, —, (, consumes, -, censured,\n",
      "Nearest to have: had, has, were, be, having, are, martyred, preserve,\n",
      "Nearest to UNK: tselinograd, 10,000, 1835., 4, quilting, r, jewellery, deadline,\n",
      "Nearest to or: and, (, buddhist, hematoma, centred, preeminent, retain, angélil,\n",
      "Nearest to :: ., ;, ,, —, pan-slavic, consumes, resorts, reunify,\n",
      "Nearest to the: its, a, their, his, an, her, typing, paroled,\n",
      "Average loss at step 22000: 11.470342\n",
      "Average loss at step 24000: 11.460939\n",
      "Average loss at step 26000: 11.296547\n",
      "Average loss at step 28000: 10.891410\n",
      "Average loss at step 30000: 10.739479\n",
      "Nearest to .: ;, ,, alfa, —, --, outcroppings, conventions, shaved,\n",
      "Nearest to which: who, that, but, and, where, whom, shoppers, repeal,\n",
      "Nearest to an: spartan, rockwell, pablo, cheadle, novgorodians, irwin, resnick, corrugated,\n",
      "Nearest to as: creating, yorkville, kunda, sài, f-75, including, retains, rossini,\n",
      "Nearest to be: been, being, become, spectroscopic, clearly, remain, write, kieft,\n",
      "Nearest to first: last, next, second, final, best, only, third, highest,\n",
      "Nearest to ,: ;, ., —, -, hazardous, –, ''protecteur, inflicted,\n",
      "Nearest to (: -, na+, travis, =, quis, 4-d, preemptive, sarawak,\n",
      "Nearest to from: across, in, longevity, overcome, snoop, via, missile, panicked,\n",
      "Nearest to for: bien, water…and, nasty, hectare, of, goryeo, keller, −300,\n",
      "Nearest to ;: ., ,, —, -, --, one-man, :, mucous,\n",
      "Nearest to have: had, has, having, rely, are, western-style, year.the, were,\n",
      "Nearest to UNK: blue, tarnów, monkees, tselinograd, silent, 1.1, artificially, 300,\n",
      "Nearest to or: and, preeminent, formatted, landmass, langston, morton, erysipelas, tubulin,\n",
      "Nearest to :: dexter, freshest, une, -, cowdery, ;, include, aerostatic/aerodynamic,\n",
      "Nearest to the: its, a, his, their, harriet, debbie, our, tranquilizer,\n",
      "Average loss at step 32000: 10.743503\n",
      "Average loss at step 34000: 10.741862\n",
      "Average loss at step 36000: 10.708315\n",
      "Average loss at step 38000: 10.610916\n",
      "Average loss at step 40000: 10.676803\n",
      "Nearest to .: ;, ,, :, and, 1932–33, harvested, cornelius, kalmykova,\n",
      "Nearest to which: that, who, whom, but, what, where, repeal, ecstasy,\n",
      "Nearest to an: rockwell, sarai, constrained, irwin, resnick, open-spandrel, spartan, corrugated,\n",
      "Nearest to as: yorkville, self-esteem, kunda, attractive, |mar_lo_°c, ravens, f-75, disperse,\n",
      "Nearest to be: been, being, fully, remain, amnh, have, was, clearly,\n",
      "Nearest to first: last, second, next, earliest, only, final, original, best,\n",
      "Nearest to ,: ;, ., —, and, -, ''protecteur, shipboard, –,\n",
      "Nearest to (: -, —, dram, approximately, =, 30.1, –, na+,\n",
      "Nearest to from: into, across, collects, lcd, documenting, mastiff, deep-seated, accommodating,\n",
      "Nearest to for: nasty, soo, introductions, in, water…and, yamaha, among, tumwater,\n",
      "Nearest to ;: ,, ., —, superfluous, petitioner, pro-russian, complains, fund-raising,\n",
      "Nearest to have: had, has, are, having, contain, were, martyred, apply,\n",
      "Nearest to UNK: r, tselinograd, perch, zha, re-instated, eighth, 300, speculates,\n",
      "Nearest to or: and, somebody, formatted, nor, dat, preeminent, 4-5, landmass,\n",
      "Nearest to :: dexter, consumes, word, providers, stallion, differentiating, pan-slavic, .,\n",
      "Nearest to the: a, their, your, his, its, any, zaidi, generalfeldmarschall,\n",
      "Average loss at step 42000: 10.604452\n",
      "Average loss at step 44000: 10.669711\n",
      "Average loss at step 46000: 10.638800\n",
      "Average loss at step 48000: 10.602861\n",
      "Average loss at step 50000: 10.685731\n",
      "Nearest to .: ;, ,, :, —, photographing, interviewing, shias, in,\n",
      "Nearest to which: that, and, but, where, what, whom, who, ecstasy,\n",
      "Nearest to an: rockwell, corrugated, reclassify, boise, irwin, novgorodians, resnick, the,\n",
      "Nearest to as: self-esteem, kunda, triploid, attractive, ravens, |mar_lo_°c, racquet, yorkville,\n",
      "Nearest to be: been, being, easily, replace, surpass, remain, solve, readily,\n",
      "Nearest to first: last, next, second, earliest, final, fourth, third, only,\n",
      "Nearest to ,: —, ;, ., (, and, in, djurgårdens, shipboard,\n",
      "Nearest to (: —, -, dram, ,, –, ''hancock, approximately, ;,\n",
      "Nearest to from: into, in, through, lcd, across, sault, liaison, towards,\n",
      "Nearest to for: nasty, yamaha, introductions, soo, during, arbitrarily, bien, in,\n",
      "Nearest to ;: ., ,, —, :, -, consumes, (, than,\n",
      "Nearest to have: had, has, having, are, were, contain, martyred, rely,\n",
      "Nearest to UNK: hawkeye, silent, tselinograd, brown, non-living, aesthetics, d, here,\n",
      "Nearest to or: and, formosan, dat, desc, nor, preeminent, containing, formatted,\n",
      "Nearest to :: ., ;, differentiating, consumes, resorts, dexter, cowdery, methodologies,\n",
      "Nearest to the: a, its, their, this, horsetails, his, acelhuate, delagoa,\n",
      "Average loss at step 52000: 10.430200\n",
      "Average loss at step 54000: 10.324997\n",
      "Average loss at step 56000: 10.216399\n",
      "Average loss at step 58000: 10.217039\n",
      "Average loss at step 60000: 10.210400\n",
      "Nearest to .: ,, ;, albinus, of, :, ?, 'big, matsui,\n",
      "Nearest to which: that, who, whom, what, shoppers, but, sheikh, repeal,\n",
      "Nearest to an: rockwell, resnick, spartan, open-spandrel, kant, irwin, corrugated, takings,\n",
      "Nearest to as: ravens, self-esteem, blaming, sài, beginning, result, creating, attractive,\n",
      "Nearest to be: been, have, being, easily, clearly, fully, replace, grow,\n",
      "Nearest to first: last, second, earliest, next, only, final, best, original,\n",
      "Nearest to ,: ., ;, —, theobromine, -, ''protecteur, cabled, :,\n",
      "Nearest to (: [, -, 405, bernard, adventurers, dram, horace, 30.1,\n",
      "Nearest to from: jawbone, metrovick, overcome, lcd, replacing, across, into, in,\n",
      "Nearest to for: introductions, seeker, spion, reactor, nasty, smelting, bien, rehabilitated,\n",
      "Nearest to ;: ., ,, —, khaldun, prowess, -, avellaneda, :,\n",
      "Nearest to have: has, had, be, having, dumps, rely, apply, contain,\n",
      "Nearest to UNK: tselinograd, 1800, re-instated, -1, fostered, tarnów, r., cobo,\n",
      "Nearest to or: and, formatted, landmass, centred, meaning, preeminent, hematoma, reciting,\n",
      "Nearest to :: termed, dexter, providers, freshest, ., teufel, stallion, nickname,\n",
      "Nearest to the: a, glial, blackburn, our, appease, 'the, atheistic, various,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average loss at step 62000: 10.223157\n",
      "Average loss at step 64000: 10.105503\n",
      "Average loss at step 66000: 10.191790\n",
      "Average loss at step 68000: 10.157220\n",
      "Average loss at step 70000: 10.154481\n",
      "Nearest to .: ,, ;, of, and, in, that, verbiage, rostand,\n",
      "Nearest to which: that, who, and, ecstasy, whom, but, repeal, whose,\n",
      "Nearest to an: rockwell, resnick, spartan, irwin, sarai, cavitation, boise, novgorodians,\n",
      "Nearest to as: ravens, blaming, sài, kunda, rossini, medial, continuous-wave, result,\n",
      "Nearest to be: been, being, customisation, easily, replace, surpass, occur, were,\n",
      "Nearest to first: last, second, next, earliest, fourth, final, only, oldest,\n",
      "Nearest to ,: ., ;, and, —, of, langevin, recuperating, assaulted,\n",
      "Nearest to (: -, 1187., wander, ;, holmgard, —, eucharistic, mib,\n",
      "Nearest to from: in, towards, lcd, longevity, accommodating, accra, into, rampa,\n",
      "Nearest to for: yamaha, introductions, bien, nasty, during, fucking, gourmet, dislodge,\n",
      "Nearest to ;: ., ,, pro-russian, —, (, consumes, ?, :,\n",
      "Nearest to have: had, has, were, are, having, rely, brutish, be,\n",
      "Nearest to UNK: silent, hellene, weak, berger, hardiness, headingley, bone, 39,\n",
      "Nearest to or: and, formatted, preeminent, sax, pre-s2, reciting, baleen, buddhist,\n",
      "Nearest to :: reunify, termed, differentiating, replicates, consumes, liberalised, teufel, ;,\n",
      "Nearest to the: its, a, their, glial, these, 1937–1945, this, his,\n",
      "Average loss at step 72000: 10.217530\n",
      "Average loss at step 74000: 10.146726\n",
      "Average loss at step 76000: 10.247005\n",
      "Average loss at step 78000: 10.026597\n",
      "Average loss at step 80000: 9.882595\n",
      "Nearest to .: ;, ,, 1924., 10., 2003., 2006., 2004., 1983.,\n",
      "Nearest to which: that, whom, who, 35.6, where, shoppers, roney, but,\n",
      "Nearest to an: rockwell, resnick, irwin, corrugated, novgorodians, spartan, cheadle, sarai,\n",
      "Nearest to as: yorkville, quintessentially, |mar_lo_°c, sài, self-esteem, escapes, mississippians, thessaly,\n",
      "Nearest to be: been, being, surpass, easily, customisation, replace, deliberately, occur,\n",
      "Nearest to first: last, second, next, fourth, only, earliest, final, oldest,\n",
      "Nearest to ,: ., ;, and, melinda, refuelling, —, apostate, hunslet,\n",
      "Nearest to (: -, na+, dram, chanute, indented, lihue, 4-d, approximately,\n",
      "Nearest to from: in, lampboard, deep-seated, waterways, across, israeli-palestinian, cambodian, lcd,\n",
      "Nearest to for: water…and, nasty, concealing, γαλαξίας, yamaha, bien, keller, kopfstein,\n",
      "Nearest to ;: ., ,, deco, --, —, one-man, penned, mucous,\n",
      "Nearest to have: had, has, having, rely, contain, year.the, were, spend,\n",
      "Nearest to UNK: unsuited, 1.1, 99, tarnów, tselinograd, schlich, monkees, natasha,\n",
      "Nearest to or: formatted, and, preeminent, plus, landmass, pre-s2, semivowels, lukewarm,\n",
      "Nearest to :: differentiating, une, termed, freshest, aerostatic/aerodynamic, dexter, dragged, rhinos,\n",
      "Nearest to the: a, its, non-agricultural, forster, an, shawnee, tanzanian, paroled,\n",
      "Average loss at step 82000: 9.922878\n",
      "Average loss at step 84000: 9.897537\n",
      "Average loss at step 86000: 9.913045\n",
      "Average loss at step 88000: 9.824237\n",
      "Average loss at step 90000: 9.811843\n",
      "Nearest to .: ,, ;, jazzy, 2001., bethad, 1821., supertankers, align=,\n",
      "Nearest to which: that, who, whom, what, but, shoppers, where, and,\n",
      "Nearest to an: rockwell, resnick, sarai, corrugated, thane, open-spandrel, fended, entremeses,\n",
      "Nearest to as: self-esteem, |mar_lo_°c, thelma, triploid, kunda, ravens, yorkville, quintessentially,\n",
      "Nearest to be: been, being, surpass, become, fully, grow, regain, remain,\n",
      "Nearest to first: second, last, earliest, next, fourth, oldest, final, facultatively,\n",
      "Nearest to ,: ., ;, —, posit, and, -, 802.11b, of,\n",
      "Nearest to (: -, —, 30.1, teddy, investment-grade, 'scouse, <, 405,\n",
      "Nearest to from: lampboard, into, mastiff, in, deep-seated, fasi, lcd, alchemical,\n",
      "Nearest to for: yamaha, soo, introductions, nasty, water…and, electrical, fattened, reactor,\n",
      "Nearest to ;: ,, ., —, complains, superfluous, pro-russian, scorers, transliterations,\n",
      "Nearest to have: had, has, exist, contain, having, are, contribute, represent,\n",
      "Nearest to UNK: re-instated, reuters, loftus, -1, gaston, multi-instrumentalist, harvard, tarnów,\n",
      "Nearest to or: formatted, and, landmass, 4-5, lampooned, buddhist, preeminent, nor,\n",
      "Nearest to :: freshest, dexter, providers, differentiating, actor-managers, stanislaus, retorted, aerostatic/aerodynamic,\n",
      "Nearest to the: a, his, paroled, its, newly-created, delagoa, stormtrooper, woda,\n",
      "Average loss at step 92000: 9.857963\n",
      "Average loss at step 94000: 9.855468\n",
      "Average loss at step 96000: 9.892065\n",
      "Average loss at step 98000: 9.858063\n",
      "Average loss at step 100000: 9.912151\n",
      "Nearest to .: ;, ,, pear-shaped, of, d'etat, :, and, seaways,\n",
      "Nearest to which: that, whom, what, where, who, alushta, redfin, 35.6,\n",
      "Nearest to an: rockwell, corrugated, resnick, irwin, boise, 40.4, novgorodians, thane,\n",
      "Nearest to as: kunda, triploid, yorkville, racquet, quintessentially, ravens, self-esteem, result,\n",
      "Nearest to be: been, being, surpass, replace, easily, was, formally, partially,\n",
      "Nearest to first: last, second, earliest, next, fourth, oldest, final, best,\n",
      "Nearest to ,: ., ;, —, ''protecteur, and, diphthongisation, chokai, ultimatetv,\n",
      "Nearest to (: —, dram, bernard, -, 30.1, –, sarawak, toray,\n",
      "Nearest to from: lampboard, phosphorus, rossby, lighter-than-air, lcd, wendt, alchemical, longevity,\n",
      "Nearest to for: yamaha, introductions, nasty, freest, seeker, water…and, mistook, reminded,\n",
      "Nearest to ;: ., ,, :, durant, --, —, basel-landschaft, >,\n",
      "Nearest to have: has, had, contain, having, spend, contribute, rely, 've,\n",
      "Nearest to UNK: silent, darts, tselinograd, 4th, berger, jewellery, honour, claudius,\n",
      "Nearest to or: formatted, and, slew, desc, sax, preeminent, pre-s2, meaning,\n",
      "Nearest to :: differentiating, ;, consumes, pour, freshest, mattila, termed, recounting,\n",
      "Nearest to the: a, his, its, their, 1959-1960, this, species-rich, 1.88,\n"
     ]
    }
   ],
   "source": [
    "# Train the structured skip-gram model, periodically reporting the averaged loss\n",
    "# and the nearest neighbours of the validation words, then persist the results.\n",
    "# All intervals below are measured in optimisation steps.\n",
    "num_steps = 100001\n",
    "decay_learning_rate_every = 2000 # NOTE(review): not referenced anywhere in this cell\n",
    "loss_report_every = 2000 # average and report the loss over this many batches\n",
    "similarity_eval_every = 10000 # how often the validation neighbours are printed\n",
    "top_k = 8 # number of nearest neighbors\n",
    "skip_gram_loss = [] # Collect the sequential loss values for plotting purposes\n",
    "\n",
    "with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as session:\n",
    "  tf.global_variables_initializer().run()\n",
    "  print('Initialized')\n",
    "  average_loss = 0\n",
    "  for step in range(num_steps):\n",
    "    batch_data, batch_labels = generate_batch(\n",
    "      batch_size, window_size)\n",
    "    # train_labels is indexed once per label column (2*window_size of them);\n",
    "    # each entry is fed the matching column of batch_labels reshaped to (-1, 1).\n",
    "    feed_dict = {train_dataset : batch_data}\n",
    "    for wi in range(2*window_size):\n",
    "      feed_dict[train_labels[wi]] = np.reshape(batch_labels[:,wi],(-1,1))\n",
    "\n",
    "    _, batch_loss = session.run([optimizer, loss], feed_dict=feed_dict)\n",
    "    average_loss += batch_loss\n",
    "\n",
    "    if (step+1) % loss_report_every == 0:\n",
    "      # The average loss is an estimate of the loss over the last\n",
    "      # loss_report_every batches.\n",
    "      average_loss = average_loss / loss_report_every\n",
    "      print('Average loss at step %d: %f' % (step+1, average_loss))\n",
    "      skip_gram_loss.append(average_loss)\n",
    "      average_loss = 0\n",
    "    # note that this is expensive (~20% slowdown if computed every 500 steps)\n",
    "    if (step+1) % similarity_eval_every == 0:\n",
    "      sim = similarity.eval()\n",
    "      for i in range(valid_size):\n",
    "        valid_word = reverse_dictionary[valid_examples[i]]\n",
    "        # Sort by descending similarity; the top-ranked entry is skipped\n",
    "        # (presumably the query word itself) and the next top_k are kept.\n",
    "        nearest = (-sim[i, :]).argsort()[1:top_k+1]\n",
    "        neighbours = ''.join(' %s,' % reverse_dictionary[idx] for idx in nearest)\n",
    "        print('Nearest to %s:%s' % (valid_word, neighbours))\n",
    "  # Must be evaluated while the session is still open.\n",
    "  skip_gram_final_embeddings = normalized_embeddings.eval()\n",
    "\n",
    "# We will save the word vectors learned and the loss over time\n",
    "# as this information is required later for comparisons\n",
    "np.save('struct_skip_embeddings',skip_gram_final_embeddings)\n",
    "\n",
    "# newline='' hands line-ending control to the csv module; without it the row\n",
    "# terminator gets a doubled carriage return on Windows (Python 3).\n",
    "with open('struct_skip_losses.csv', 'wt', newline='') as f:\n",
    "    writer = csv.writer(f, delimiter=',')\n",
    "    writer.writerow(skip_gram_loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
